My selling rules need to be based on ML — perhaps a job I run each day that predicts whether the position will still hit my goal, given one more day of data. Each day it will predict the probability of still hitting the target; if that probability drops below 50%, sell.
In [211]:
# Record when this notebook was last executed.
run_stamp <- format(Sys.time(), "%a %b %d %X %Y")
cat('Last run: ', "\n", run_stamp)
In [ ]:
# Load (installing if missing) every package this notebook uses.
# NOTE: load order matters here — later packages mask functions from
# earlier ones — so the sequence is left exactly as written.
pacman::p_load(feather, tidyverse, tabplot, titanic, devtools, caret, gmodels, lubridate,
ggjoy, Amelia, ggvis, rattle, corrplot, caretEnsemble, GGally, quantmod, TTR, sqldf, tidyquant,
PerformanceAnalytics, dygraphs, magrittr
, install = TRUE, update = getOption("pac_update"), character.only = FALSE)
# Development versions installed straight from GitHub.
pacman::p_load_gh("clauswilke/ggjoy","rstats-db/odbc","rstudio/rmarkdown",'IRkernel/IRkernel' ) #"business-science/tidyquant"
IRkernel::installspec() # to register the kernel in the current R installation
options(warn=-1) # silence warnings globally for cleaner notebook output
In [213]:
# Download daily OHLCV prices for QQQ from 2011 onward.
# Other tickers tried previously: GLD JJC UGA JJN SLV SPXL
stdf <- tidyquant::tq_get(
  "QQQ",
  get = "stock.prices",
  from = "2011-01-01"
)
In [214]:
# Quick look at the downloaded data: first rows, structure, last rows.
stdf %>% head()
stdf %>% str()
stdf %>% tail()
In [215]:
# Create the target variable: will the stock's best high over the next
# `leadn` trading days be at least 2% above tomorrow's open?
# NOTE(review): leadn is 16, but the comments and column names below say
# "15 days" — confirm the intended look-ahead horizon.
leadn <- 16
stdf <- mutate(stdf,
# trailing max of the daily high over the last `leadn` days (incl. today)
runMaxHigh = runMax(stdf[,c("high")], n = leadn, cumulative = F),
# shifted forward by `leadn`: the max high over the NEXT `leadn` days
Max15 = lead(runMaxHigh, n=leadn, order_by = date),
# trailing min of the daily low over the last `leadn` days
runMinLow = runMin(stdf[,c("low")], n = leadn, cumulative = F),
# shifted forward by `leadn`: the min low over the NEXT `leadn` days
Min15 = lead(runMinLow, n=leadn, order_by = date),
# the single high `leadn` days ahead ...
lead15 = lead(high, n=leadn, order_by = date),
# ... and its percent gain: (price sold - purchase price)/(purchase price)
percentchange15 = round((100 * (lead15 - high)/(high)),2),
# tomorrow's open — the price a buy order placed today would fill at
openpricetomorrow = lead(open, n=1, order_by = date),
# percent gain from tomorrow's open (the entry price) to the best high
# in the following `leadn`-day window
percentchangeMax15 = round((100 * (Max15 - openpricetomorrow)/(openpricetomorrow)),2),
# binary target: did the window's best high beat the entry by >= 2%?
label_twopercent = case_when(
percentchangeMax15 < 2 ~ "class0",
percentchangeMax15 >= 2 ~ "class1"
)
) #end mutate
#convert target to factor (caret requires a factor outcome)
stdf$label_twopercent %<>% factor
# the "class" prefix keeps the levels valid R names, which caret needs
# when classProbs = TRUE, and matches the competition output format
#stdf$label_twopercent <- sub("^", "class", stdf$label_twopercent)
glimpse(stdf)
In [216]:
# Remove every forward-looking column so no future data can leak into
# the predictors; only the label keeps future information.
leak_cols <- c("percentchangeMax15", "openpricetomorrow", "lead15",
               "Max15", "Min15", "percentchange15")
stdf <- dplyr::select(stdf, -dplyr::one_of(leak_cols))
glimpse(stdf)
In [217]:
#stdf$label_twopercent <- ifelse(stdf$percentchangeMax15 >= 2.0, 1,0) #will the price jump up 6% sometime withing the next 3 weeks
#https://www.rdocumentation.org/packages/dplyr/versions/0.7.2/topics/case_when
#stdf$label_twopercent <- case_when(
#stdf$percentchangeMax15 < -2 ~ -2,
# stdf$percentchangeMax15 < 0 & stdf$percentchangeMax15 >= -2 ~ -1,
# stdf$percentchangeMax15 >=0 & stdf$percentchangeMax15 < 1 ~ 0,
#stdf$percentchangeMax15 >= 1 & stdf$percentchangeMax15 < 2 ~ 1,
#stdf$percentchangeMax15 >= 2 & stdf$percentchangeMax15 < 3 ~ 2,
#stdf$percentchangeMax15 >= 3 & stdf$percentchangeMax15 < 4 ~ 3,
# stdf$percentchangeMax15 >= 4 & stdf$percentchangeMax15 < 5 ~ 4,
# stdf$percentchangeMax15 >= 5 & stdf$percentchangeMax15 < 6 ~ 5,
# stdf$percentchangeMax15 >= 6 & stdf$percentchangeMax15 < 7 ~ 6,
# stdf$percentchangeMax15 >= 7 & stdf$percentchangeMax15 < 8 ~ 7,
# stdf$percentchangeMax15 >= 8 ~ 8
#stdf$percentchangeMax15 < 2 ~ 0,
#stdf$percentchangeMax15 >= 2 ~ 1
#)
#stdf$label_twopercent <- as.factor(stdf$label_twopercent)
In [218]:
# target is na for the last n observations. i'll separate those after creating indicators and at the very end predict on them
In [219]:
# Create predictor features: calendar fields, EMAs of the daily high,
# price-vs-EMA and EMA-vs-EMA flags, EMA slope / slope-turn indicators,
# new-high/new-low flags, and standard momentum/volume indicators.
stdf <- mutate(stdf,
  # calendar features
  dayofweek = wday(date),
  dayofmonth = mday(date),
  dayofyear = yday(date),
  weekofyear = week(date),
  monthofyear = month(date),
  year = year(date),
  # exponential moving averages of the high
  # (trailing empty arguments removed from the original calls)
  EMA200 = TTR::EMA(high, n = 200),
  EMA100 = TTR::EMA(high, n = 100),
  EMA50 = TTR::EMA(high, n = 50),
  EMA20 = TTR::EMA(high, n = 20),
  EMA10 = TTR::EMA(high, n = 10),
  EMA5 = TTR::EMA(high, n = 5),
  # is the price at/above each moving average?
  PoM200 = ifelse(high >= EMA200, 1, 0),
  PoM100 = ifelse(high >= EMA100, 1, 0),
  PoM50 = ifelse(high >= EMA50, 1, 0),
  PoM20 = ifelse(high >= EMA20, 1, 0),
  PoM10 = ifelse(high >= EMA10, 1, 0),
  PoM5 = ifelse(high >= EMA5, 1, 0),
  # faster moving average at/above slower moving average?
  MoM100200 = ifelse(EMA100 >= EMA200, 1, 0), # 100 over 200
  MoM50200 = ifelse(EMA50 >= EMA200, 1, 0),   # 50 over 200
  MoM20200 = ifelse(EMA20 >= EMA200, 1, 0),   # 20 over 200
  MoM50 = ifelse(EMA20 >= EMA50, 1, 0),       # 20 over 50
  MoM20 = ifelse(EMA10 >= EMA20, 1, 0),       # 10 over 20
  MoM10 = ifelse(EMA5 >= EMA10, 1, 0),        # 5 over 10
  MoM550 = ifelse(EMA5 >= EMA50, 1, 0),       # 5 over 50
  MoM520 = ifelse(EMA5 >= EMA20, 1, 0),       # 5 over 20
  # EMA200 slope and slope-turn indicator.
  # BUG FIX: the original lagged the raw slope (lag(EMA200Slope)) and
  # then compared the 0/1 flag against it, so *SlopePNchange mixed a
  # flag with a raw slope value. Lag the 0/1 flag itself so PNchange
  # means "slope just turned from negative to positive".
  lagEMA200 = lag(EMA200, n = 1, order_by = date),    # only used to calc slope
  EMA200Slope = (EMA200 - lagEMA200),                 # slope value
  EMA200SlopePN = ifelse(EMA200Slope > 0, 1, 0),      # 1 = positive slope
  lagEMA200SlopePN = lag(EMA200SlopePN, n = 1, order_by = date),
  EMA200SlopePNchange = ifelse(EMA200SlopePN > lagEMA200SlopePN, 1, 0),
  # EMA50 slope (same fix as EMA200)
  lagEMA50 = lag(EMA50, n = 1, order_by = date),
  EMA50Slope = (EMA50 - lagEMA50),
  EMA50SlopePN = ifelse(EMA50Slope > 0, 1, 0),
  lagEMA50SlopePN = lag(EMA50SlopePN, n = 1, order_by = date),
  EMA50SlopePNchange = ifelse(EMA50SlopePN > lagEMA50SlopePN, 1, 0),
  # EMA20 slope (same fix as EMA200)
  lagEMA20 = lag(EMA20, n = 1, order_by = date),
  EMA20Slope = (EMA20 - lagEMA20),
  EMA20SlopePN = ifelse(EMA20Slope > 0, 1, 0),
  lagEMA20SlopePN = lag(EMA20SlopePN, n = 1, order_by = date),
  EMA20SlopePNchange = ifelse(EMA20SlopePN > lagEMA20SlopePN, 1, 0),
  # EMA10 slope (same fix as EMA200)
  lagEMA10 = lag(EMA10, n = 1, order_by = date),
  EMA10Slope = (EMA10 - lagEMA10),
  EMA10SlopePN = ifelse(EMA10Slope > 0, 1, 0),
  lagEMA10SlopePN = lag(EMA10SlopePN, n = 1, order_by = date),
  EMA10SlopePNchange = ifelse(EMA10SlopePN > lagEMA10SlopePN, 1, 0),
  # new high in the past 15 days:
  # trailing max of the high over the last 15 days
  runMaxHighndays = runMax(high, n = 15, cumulative = FALSE),
  # 1 if today's high exceeds yesterday's trailing max (a fresh high)
  newhighYN = ifelse(high > lag(runMaxHighndays, n = 1, order_by = date), 1, 0),
  # 1 if ANY day in the last 15 set a fresh high
  newhighndays = runMax(newhighYN, n = 15, cumulative = FALSE),
  # new low in the past 15 days:
  runMinLowndays = runMin(low, n = 15, cumulative = FALSE),
  # 1 if today's low undercuts yesterday's trailing min (a fresh low)
  newlowYN = ifelse(low < lag(runMinLowndays, n = 1, order_by = date), 1, 0),
  # BUG FIX: the original used runMin here, which is 1 only when EVERY
  # day in the window was a fresh low; to mirror newhighndays ("any
  # fresh low in the last 15 days") this must be runMax.
  newlowndays = runMax(newlowYN, n = 15, cumulative = FALSE),
  # rate of change over 10 days
  roc = ROC(high, n = 10, type = c("continuous"), na.pad = TRUE),
  # relative strength index over 14 days
  rsi = RSI(high, n = 14),
  # on-balance volume
  obv = OBV(high, volume),
  # annualized close-to-close volatility over a 10-day window
  volatility = volatility(high, n = 10, calc = "close", N = 260, mean0 = FALSE)
) # end mutate
glimpse(stdf)
tail(stdf$newhighndays)
tail(stdf$newlowndays)
In [220]:
# I can leave the date features numeric but need to create char versions with dummies also
#create char versions
#stdf$c_dayofweek = as.character(stdf$dayofweek)
#stdf$c_dayofmonth = as.character(stdf$dayofmonth)
#stdf$c_dayofyear = as.character(stdf$dayofyear)
#stdf$c_weekofyear = as.character(stdf$weekofyear)
#stdf$c_monthofyear = as.character(stdf$monthofyear)
#stdf$c_year = as.character(stdf$year)
#create dummy variables
#dmy <- caret::dummyVars(" ~ c_dayofweek + c_dayofmonth + c_weekofyear + c_monthofyear + c_year + c_dayofyear", data = stdf, fullRank = T)
#dataset_d <- data.frame(predict(dmy, newdata = stdf))
# recombine data
#stdf <- cbind(stdf, dataset_d)
#dim(stdf)
#glimpse(stdf)
In [221]:
# Indicators that return multiple columns at once.
# MACD adds "macd" and "signal"; the matrix column names survive cbind().
macd_mat <- TTR::MACD(stdf$high, 12, 26, 9, maType = "EMA")
stdf <- cbind(stdf, macd_mat)
# 1 when the MACD line is above its signal line.
stdf <- mutate(stdf, macddir = ifelse(macd > signal, 1, 0))
# Aroon adds "aroonUp", "aroonDn", and "oscillator".
aroon_mat <- aroon(stdf[, c("high", "low")], n = 20)
stdf <- cbind(stdf, aroon_mat)
# 1 when the up-trend component dominates the down-trend component.
stdf <- mutate(stdf, aroondir = ifelse(aroonUp > aroonDn, 1, 0))
glimpse(stdf)
In [222]:
#add rolling recent high feature
#add crossovers of price and ema's and ema's crossing themselfves
#crossover of macd and its signal
In [223]:
# Visualize missing values across all columns, clustered and sorted.
visdat::vis_miss(stdf, cluster = TRUE, sort_miss = TRUE)
In [224]:
# Numeric-only view of the data for correlation / pair plots.
n_dataset <- dplyr::select_if(stdf, is.numeric)
# Pair panels, disabled (slow for this many columns):
#psych::pairs.panels(dplyr::select(n_dataset, 1:10), pch = ".", hist.col = "darkgreen", ellipse = F, lm = T)
#psych::pairs.panels(dplyr::select(n_dataset, 11:20), pch = ".", hist.col = "darkgreen", ellipse = F, lm = T)
In [225]:
# Last rows of the numeric subset.
n_dataset %>% tail()
In [226]:
# Correlation chart for one pair of numeric features.
PerformanceAnalytics::chart.Correlation(
  dplyr::select(n_dataset, dayofyear, obv),
  histogram = TRUE, pch = "."
)
In [227]:
# Share of each target class (baseline hit rate for always guessing YES).
n_total <- nrow(stdf)
stdf %>%
  dplyr::group_by(label_twopercent) %>%
  dplyr::summarise(count = n() / n_total)
If I guessed YES (that the stock will hit a high of n%) for every observation, I would be right class1-percent of the time. What I really care about is guessing YES when the stock then fails to hit my target, so I will focus on negative predictive value.
So the real question I want answered is: when I predict a YES, how accurate am I?
In [228]:
# Rows to trim: the first ~50 have NA indicators (long EMA/rolling
# windows), and the last rows have no label so they can't be trained on.
dim(stdf)
In [229]:
# Split off the newest rows (to score later) from the trainable rows.
nd <- nrow(stdf)        # total row count
st <- nrow(stdf) - 15   # last row index of the trainable subset
st
nd
exc <- dplyr::slice(stdf, 1:st)  # everything except the last 15 rows
# BUG FIX: the original used `st+1:nd`, which `:`-precedence parses as
# st + (1:nd), i.e. rows (st+1)..(st+nd); it only worked because slice()
# silently drops out-of-range indices. Make the intent explicit:
exend <- dplyr::slice(stdf, (st + 1):nd)  # last 15 rows, scored at the end
# NOTE(review): leadn is 16, so the final 16 rows have an NA label while
# only 15 are clipped here; the one leftover NA-label row is removed by
# na.omit() in the next cell, so this is benign — but confirm the horizon.
dim(exc)
tail(exc)
tail(exend, n = 15)
In [230]:
# Remove every row containing an NA from the trainable set
# (leading indicator warm-up rows and the leftover unlabeled row).
exc <- na.omit(exc)
dim(exc)
head(exc)
In [231]:
# Hold out a validation set: 67% train / 33% validation, stratified on
# the target.
# NOTE(review): this is a random split of time-series data, so validation
# rows are interleaved with training rows — consider a time-based split
# to avoid look-ahead leakage.
validation_index <- createDataPartition(exc$label_twopercent, p = 0.67, list = FALSE)
sktrain <- exc[validation_index, ]
validation <- exc[-validation_index, ]
In [ ]:
In [232]:
# 10-fold cross-validation with up-sampling of the minority class;
# optimize ROC, which requires class probabilities and twoClassSummary.
control <- trainControl(
  method = "cv",
  number = 10,
  sampling = "up",
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
metric <- "ROC" # alternative: "Accuracy"
In [233]:
# Model formula listing the engineered predictors.
# NOTE(review): inside an R formula, `/` denotes NESTING (a/b expands to
# a + a:b), not arithmetic division. Terms like runMaxHigh/runMinLow and
# obv/roc therefore add nesting interactions; if ratio features were
# intended, they would need to be written as I(a/b) — confirm intent.
formula <- label_twopercent ~ volume + rsi + PoM50 + dayofweek + dayofmonth + dayofyear + weekofyear + year + MoM20 +
PoM20 + PoM10 + PoM5 + MoM50 + lagEMA20 + EMA20Slope + EMA20SlopePN + lagEMA20SlopePN + EMA20SlopePNchange +
macd + signal + roc + obv + aroonUp + aroonDn + oscillator + runMaxHigh + runMinLow + aroondir + macddir +
MoM10 + MoM550 + MoM520 + lagEMA10 + EMA10Slope + EMA10SlopePN + lagEMA10SlopePN + EMA10SlopePNchange + newhighYN +
newhighndays + runMaxHighndays + runMinLowndays + newlowYN + newlowndays + runMaxHigh/runMinLow + dayofyear/dayofmonth +
dayofyear/dayofweek + obv/dayofyear + obv/signal + dayofyear/signal + obv/roc + obv/EMA10Slope + EMA200 +
EMA100 + PoM200 + PoM100 + MoM100200 + MoM50200 + MoM20200 + lagEMA200 + EMA200Slope + EMA200SlopePN +
lagEMA200SlopePN + EMA200SlopePNchange + lagEMA50 + EMA50Slope + EMA50SlopePN + lagEMA50SlopePN + EMA50SlopePNchange +
volatility + volatility/obv + volatility/dayofyear
In [234]:
# create formula and set for using all the dummies
#labely <- sktrain$label_twopercent
#sktrain <- dplyr::select_if(sktrain, is.numeric)
#sktrain <- cbind(labely, sktrain)
#formula <- labely ~ .
In [235]:
# Train candidate models; reseed before each fit so the CV folds are
# identical across models and the resampling comparison is fair.
set.seed(13)
fit.c5 <- train(
  formula, data = sktrain, method = "C5.0",
  preProcess = c('zv', 'medianImpute', 'BoxCox'),
  metric = metric, trControl = control, na.action = na.pass
)
# Alternatives tried previously (left disabled):
#fit.glmnet <- train(formula, data = sktrain, method = "glmnet", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
#fit.knn <- train(formula, data = sktrain, method = "knn", preProcess = c('zv', "center", "scale",'medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
#fit.svm <- train(formula, data = sktrain, method = "svmRadial", preProcess = c('center','scale','zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
set.seed(13)
fit.xgb <- train(
  formula, data = sktrain, method = "xgbTree",
  preProcess = c('zv', 'medianImpute', 'BoxCox'),
  metric = metric, trControl = control, na.action = na.pass
)
set.seed(13)
fit.xgbLinear <- train(
  formula, data = sktrain, method = "xgbLinear",
  preProcess = c('zv', 'medianImpute', 'BoxCox'),
  metric = metric, trControl = control, na.action = na.pass
)
In [236]:
# Inspect the fitted C5.0 model (per-tuning-parameter CV results).
print(fit.c5)
#print(fit.svm)
In [237]:
# Compare resampled metrics across the fitted models.
model_set <- list(
  C5 = fit.c5,
  #GLMNET = fit.glmnet,
  #KNN = fit.knn,
  xgb = fit.xgb,
  XGBLinear = fit.xgbLinear
  #SVM = fit.svm
)
results <- resamples(model_set)
summary(results)
# Visual comparison of the metric distributions.
bwplot(results)
dotplot(results)
# Variable importance (disabled alternatives kept for reference).
#varImp(fit.glmnet)
#varImp(fit.svm)
#plot(varImp(fit.svm), top = 20)
plot(varImp(fit.xgb), top = 20)
In [238]:
# Test predictive power on the unseen validation set.
validation$prediction <- predict(fit.xgbLinear, newdata = validation, na.action = na.pass)
# Confusion matrix; by convention the positive class is the rarer one.
confusionMatrix(validation$prediction, validation$label_twopercent, positive = "class0")
In [239]:
# Inspect the validation rows with their predictions attached.
print(validation)
In [240]:
# Hyperparameter grid for xgbTree tuning (1*3*4*4*3*3*1 = 432 combos).
xgbgrid <- expand.grid(
  nrounds = c(2),                      # boosting rounds (single value)
  max_depth = c(5, 10, 15),            # tree depth (3 values)
  eta = c(0.1, 0.01, 0.001, 0.0001),   # learning rate (4 values)
  gamma = c(0, 1, 2, 3),               # min loss reduction to split
  colsample_bytree = c(0.4, 0.7, 1.0), # feature subsample per tree
  min_child_weight = c(0.5, 1, 1.5),   # min child node weight
  subsample = c(0.7)                   # row subsample ratio
)
# Disabled: accuracy-metric tuning run with a plain CV control.
#control <- trainControl(method = "cv", number = 10, sampling = "up")
#metric <- "Accuracy"
#set.seed(13); fit.xgb <- train(formula, data = sktrain, method = "xgbTree",
#  preProcess = c('medianImpute','BoxCox','zv'), metric = metric,
#  trControl = control, na.action = na.pass, tuneGrid = xgbgrid)
In [241]:
# Inspect the fitted xgbTree model.
print(fit.xgb)
In [242]:
# Grid search over the C5.0 tuning parameters.
c5grid <- expand.grid(
  .winnow = c(TRUE, FALSE),
  .trials = c(1, 5, 10, 15, 20, 25),
  .model = c("rules", "tree")
)
set.seed(13)
fit.c5 <- train(
  formula, data = sktrain, method = "C5.0",
  preProcess = c('zv', 'medianImpute', 'BoxCox'),
  metric = metric, trControl = control, na.action = na.pass,
  tuneGrid = c5grid
)
In [243]:
# Inspect the grid-searched C5.0 model.
print(fit.c5)
In [244]:
# Random-search tuning for C5.0: same 10-fold CV with up-sampling and
# class probabilities, but candidate parameters drawn at random.
control <- trainControl(
  method = "cv",
  number = 10,
  sampling = "up",
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  search = 'random'
)
#metric <- "Accuracy"
set.seed(13)
# BUG FIX: the original passed `tuneLengh = 20` (typo). train()'s `...`
# silently swallowed it, so the random search used the default
# tuneLength of 3 instead of evaluating 20 candidates.
fit.c5 <- train(formula, data = sktrain, method = "C5.0",
                preProcess = c('zv', 'medianImpute', 'BoxCox'),
                metric = metric,
                trControl = control,
                na.action = na.pass,
                tuneLength = 20)
In [245]:
# Inspect the random-search-tuned C5.0 model.
print(fit.c5)
In [246]:
# Top-50 variable importance for the tuned C5.0 model.
c5_importance <- varImp(fit.c5)
plot(c5_importance, top = 50)
In [247]:
# Top-50 variable importance for the xgbTree model.
xgb_importance <- varImp(fit.xgb)
plot(xgb_importance, top = 50)
In [248]:
# Evaluate the tuned C5.0 model on the unseen validation set.
# (Variable name kept from an earlier xgb experiment; it is reused below.)
xgbtuneprediction <- predict(fit.c5, newdata = validation, na.action = na.pass)
# Confusion matrix; by convention the positive class is the rarer one.
confusionMatrix(xgbtuneprediction, validation$label_twopercent,
                positive = "class0")
In [249]:
# Score the newest, still-unlabeled rows with the tuned C5.0 model.
exend$prediction <- predict(fit.c5, newdata = exend, na.action = na.pass)
# Attach the class probabilities alongside the class predictions.
class_probs <- predict(fit.c5, newdata = exend, na.action = na.pass, type = "prob")
exend <- cbind(exend, class_probs)
In [250]:
# Inspect the scored rows.
exend %>% glimpse()
In [251]:
# Trading rule: buy at the open, then place a good-til-cancel sell limit
# order 2% above the buy price (buy_price * 1.02).
dplyr::select(exend, date, prediction, class0, class1)
In [252]:
# scatter plot
# http://ggplot2.tidyverse.org/reference/geom_point.html
#p <- ggplot(validation, aes(percentchangeMax15, runMaxHigh, shape = factor(label_twopercent)))
#p + geom_point(aes(colour = factor(label_twopercent)), size = 4) +
# geom_point(colour = "grey90", size = 1.5)
In [253]:
# Class probabilities from the xgb model on the unseen validation set.
xgbtunepredictionprob <- predict(fit.xgb, newdata = validation,
                                 type = "prob", na.action = na.pass)
In [254]:
# Peek at the probability table and the class1 column specifically.
head(xgbtunepredictionprob)
head(xgbtunepredictionprob$class1)
In [255]:
# ROC curve and AUC for the xgb validation probabilities.
rocCurve <- pROC::roc(
  response = validation$label_twopercent,
  predictor = xgbtunepredictionprob[, "class1"]
  #, levels = rev(levels(validation$label_twopercent))
)
pROC::auc(rocCurve)
plot(rocCurve, legacy.axes = TRUE)
In [256]:
# i want to decrease my type II (false negatives) errors at the expense of my type I (false positive) errors
# meaning: when I get a buy signal of 1, I want to minimize the chance that it's incorrect. I don't care so much about
# the incorrectness of when I'm not getting into a trade.
In [257]:
# Scatter of future percent change vs rolling max high, colored/shaped
# by the model's prediction.
# http://ggplot2.tidyverse.org/reference/geom_point.html
# NOTE(review): percentchangeMax15 was dropped from stdf in the leakage
# cleanup cell, so `validation` has no such column and this plot will
# error — confirm which column was actually intended here.
p <- ggplot(validation, aes(percentchangeMax15, runMaxHigh, shape = factor(prediction)))
p + geom_point(aes(colour = factor(prediction)), size = 4) +
geom_point(colour = "grey90", size = 1.5)
In [ ]:
In [258]:
# Pair each validation row with the tuned model's class prediction.
d <- cbind(validation, pred = xgbtuneprediction)
print(d)
In [259]:
options(tibble.width = Inf)
# Review the costly errors: rows where the true label is class0 (target
# NOT hit) but the model predicted class1 (a buy signal).
# BUG FIX: the factor levels are "class0"/"class1" (see target creation),
# but the original filtered on '0'/'1', which matched nothing and always
# returned an empty table.
tb <- dplyr::select(d,
everything()
) %>%
dplyr::filter(., label_twopercent == 'class0' & pred == 'class1')
tb
In [ ]:
In [ ]: